library(tidyverse) # master library to deal with data frames
library(readxl) # read xlsx or xls files
library(ggrepel) # ggplot add-on, to plot names that don't collapse in same position
library(FactoMineR) # for PCA
library(factoextra) # for PCA
library(here) # usefull to save plots in folders given a root
library(viridis) # color palette package
library(plotly)

Some tests with 3D PCA plots and drugbank database

I’ve downloaded the files from the Zimmermann paper from Nature, 2019. They have a big database of 2099 compounds from drugbank to which they run a Python script to extract the chemical fingerprints, using them to represent the chemical space via a PCA.

# biolog metabolites
biolog = read_csv('Biolog_metabolites.csv')

# drugbank coordinates
drugbank.coord = read.delim('.\\Data_drug_bacteria_gene_mapping\\Input\\drugbank_pca.txt', header = FALSE)
drugbank = read_csv('.\\Data_drug_bacteria_gene_mapping\\Input\\drugbank_approved_MW_150_1000_functional_groups_all.csv') %>% select(-Number_of_carboxylic_acids_1)

# plates to use from Biolog
plates = c("PM11C", "PM12B", "PM13B", "PM14A", "PM15B", "PM16A", "PM17A", "PM18C", "PM19" , "PM20B")
biolog.drug = biolog %>% filter(Plate %in% plates)
# list of drugs present in biolog plates
drugs = biolog.drug %>% select(Metabolite) %>% unique %>% t %>% as.character

Just a glimpse of the drug names:

head(drugs)
## [1] "Amikacin"          "Chlortetracycline" "Lincomycin"       
## [4] "Amoxicillin"       "Cloxacillin"       "Lomefloxacin"

And a glimpse of the huge data table from the paper:

head(drugbank)
## # A tibble: 6 x 127
##      X1 index ALOGPS_LOGP ALOGPS_LOGS ALOGPS_SOLUBILI~ DATABASE_ID DATABASE_NAME
##   <dbl> <dbl>       <dbl>       <dbl> <chr>            <chr>       <chr>        
## 1     0    10       -0.55       -1.64 5.70e+00 g/l     DB00114     drugbank     
## 2     1    13       -3.09       -0.4  6.20e+01 g/l     DB00117     drugbank     
## 3     2    14        0.05       -2.45 1.62e+00 g/l     DB00118     drugbank     
## 4     3    16       -1.35       -1.6  4.14e+00 g/l     DB00120     drugbank     
## 5     4    17        0.17       -2.3  1.22e+00 g/l     DB00121     drugbank     
## 6     5    20       -3.49       -1.88 2.28e+00 g/l     DB00125     drugbank     
## # ... with 120 more variables: DRUGBANK_ID <chr>, DRUG_GROUPS <chr>,
## #   EXACT_MASS <dbl>, FORMULA <chr>, GENERIC_NAME <chr>, ID <chr>,
## #   INCHI_IDENTIFIER <chr>, INCHI_KEY <chr>, INTERNATIONAL_BRANDS <chr>,
## #   JCHEM_ACCEPTOR_COUNT <dbl>, JCHEM_ATOM_COUNT <dbl>,
## #   JCHEM_AVERAGE_POLARIZABILITY <dbl>, JCHEM_BIOAVAILABILITY <dbl>,
## #   JCHEM_DONOR_COUNT <dbl>, JCHEM_FORMAL_CHARGE <dbl>,
## #   JCHEM_GHOSE_FILTER <dbl>, JCHEM_IUPAC <chr>, JCHEM_LOGP <dbl>,
## #   JCHEM_MDDR_LIKE_RULE <dbl>, JCHEM_NUMBER_OF_RINGS <dbl>,
## #   JCHEM_PHYSIOLOGICAL_CHARGE <dbl>, JCHEM_PKA <dbl>,
## #   JCHEM_PKA_STRONGEST_ACIDIC <dbl>, JCHEM_PKA_STRONGEST_BASIC <dbl>,
## #   JCHEM_POLAR_SURFACE_AREA <dbl>, JCHEM_REFRACTIVITY <dbl>,
## #   JCHEM_ROTATABLE_BOND_COUNT <dbl>, JCHEM_RULE_OF_FIVE <dbl>,
## #   JCHEM_TRADITIONAL_IUPAC <chr>, JCHEM_VEBER_RULE <dbl>,
## #   MOLECULAR_WEIGHT <dbl>, PRODUCTS <chr>, SALTS <chr>,
## #   SECONDARY_ACCESSION_NUMBERS <chr>, SMILES <chr>, SYNONYMS <chr>,
## #   Number_of_aliphatic_carboxylic_acids <dbl>,
## #   Number_of_aliphatic_hydroxyl_groups <dbl>,
## #   Number_of_aliphatic_hydroxyl_groups_excluding_tert_OH <dbl>,
## #   Number_of_N_functional_groups_attached_to_aromatics <dbl>,
## #   Number_of_Aromatic_carboxylic_acide <dbl>,
## #   Number_of_aromatic_nitrogens <dbl>, Number_of_aromatic_amines <dbl>,
## #   Number_of_aromatic_hydroxyl_groups <dbl>, Number_of_carboxylic_acids <dbl>,
## #   Number_of_carbonyl_O <dbl>, Number_of_carbonyl_O__excluding_COOH <dbl>,
## #   Number_of_thiocarbonyl <dbl>,
## #   Number_of_C_OH_CCN_Ctert_alkyl_or_C_OH_CCNcyclic <dbl>,
## #   Number_of_Imines <dbl>, Number_of_Tertiary_amines <dbl>,
## #   Number_of_Secondary_amines <dbl>, Number_of_Primary_amines <dbl>,
## #   Number_of_hydroxylamine_groups <dbl>, Number_of_XCCNR_groups <dbl>,
## #   Number_of_tert_alicyclic_amines__no_heteroatoms__not_quinine_like_bridged_N_ <dbl>,
## #   Number_of_H_pyrrole_nitrogens <dbl>, Number_of_thiol_groups <dbl>,
## #   Number_of_aldehydes <dbl>,
## #   Number_of_alkyl_carbamates__subject_to_hydrolysis_ <dbl>,
## #   Number_of_alkyl_halides <dbl>,
## #   Number_of_allylic_oxidation_sites_excluding_steroid_dienone <dbl>,
## #   Number_of_amides <dbl>, Number_of_amidine_groups <dbl>,
## #   Number_of_anilines <dbl>,
## #   Number_of_aryl_methyl_sites_for_hydroxylation <dbl>,
## #   Number_of_azide_groups <dbl>, Number_of_azo_groups <dbl>,
## #   Number_of_barbiturate_groups <dbl>, Number_of_benzene_rings <dbl>,
## #   Number_of_benzodiazepines_with_no_additional_fused_rings <dbl>,
## #   Bicyclic <dbl>, Number_of_diazo_groups <dbl>,
## #   Number_of_dihydropyridines <dbl>, Number_of_epoxide_rings <dbl>,
## #   Number_of_esters <dbl>, Number_of_ether_oxygens__including_phenoxy_ <dbl>,
## #   Number_of_furan_rings <dbl>, Number_of_guanidine_groups <dbl>,
## #   Number_of_halogens <dbl>, Number_of_hydrazine_groups <dbl>,
## #   Number_of_hydrazone_groups <dbl>, Number_of_imidazole_rings <dbl>,
## #   Number_of_imide_groups <dbl>, Number_of_isocyanates <dbl>,
## #   Number_of_isothiocyanates <dbl>, Number_of_ketones <dbl>,
## #   Number_of_ketones_excluding_diaryl__a_b_unsat__dienones__heteroatom_on_Calpha <dbl>,
## #   Number_of_beta_lactams <dbl>, Number_of_cyclic_esters__lactones_ <dbl>,
## #   Number_of_methoxy_groups__OCH3 <dbl>, Number_of_morpholine_rings <dbl>,
## #   Number_of_nitriles <dbl>, Number_of_nitro_groups <dbl>,
## #   Number_of_nitro_benzene_ring_substituents <dbl>,
## #   Number_of_non_ortho_nitro_benzene_ring_substituents <dbl>,
## #   Number_of_nitroso_groups__excluding_NO2 <dbl>,
## #   Number_of_oxazole_rings <dbl>, Number_of_oxime_groups <dbl>,
## #   Number_of_para_hydroxylation_sites <dbl>, ...

Doing some data transformations (including the names of biolog compounds and that into the huge data table), we notice one important thing: only ~70 compounds from biolog are in the original data table. Anyway, it’s a good practice to see if our drugs are occupying enough of the drug space.

drugbank.coord['Drugbank'] = drugbank$GENERIC_NAME

# create a new variable, just in case
test2 = drugbank.coord
# data transformation, create Category with Biolog (if the compound is Drugbank) and Drugbank (if not)
test2 = test2 %>%
  mutate(biolog = ifelse(Drugbank %in% drugs, Drugbank, NA),
         Category = ifelse(Drugbank %in% drugs, 'Biolog', 'Drugbank'),
         Category = as.factor(Category)) 

rownames(test2) = test2$Drugbank

head(test2)
##                         V1     V2    V3            Drugbank biolog Category
## Pyridoxal Phosphate -0.452 -0.801 0.028 Pyridoxal Phosphate   <NA> Drugbank
## Histidine           -0.566 -0.949 0.606           Histidine   <NA> Drugbank
## Ademetionine         0.375 -0.703 0.468        Ademetionine   <NA> Drugbank
## L-Phenylalanine     -0.982 -0.417 1.061     L-Phenylalanine   <NA> Drugbank
## Biotin               0.686 -0.869 0.437              Biotin   <NA> Drugbank
## L-Arginine          -0.376 -1.576 1.169          L-Arginine   <NA> Drugbank

And let’s plot it in 3D with Plotly:

fig = plot_ly(data = test2, x = ~V1, y = ~V2, z = ~V3, text = rownames(test2), 
               color = ~Category, colors = c('#FF0900', '#B8B8B8'),
               marker = list(size = 3))
fig
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

In order to try to complete our dataset with more drugs, I’ve extracted the KEGG IDs from the biolog table, passed them into a website that converts them into PubChem IDs, and got the chemical fingerprints using an adapted version of the Python script available at their GitHub repository.
This way, I’ve been able to get over a 100 new compounds from biolog that were not in the original data table, but there are still ~70 compounds that need to be added. And also metformin!! Probably will be a good idea to put a label to metformin to see where it falls.

# first, separate KEGG ids to generate a new column to the biolog data frame with pubchem ids
kegg_ids = biolog %>%
  select(KEGG_ID) %>% t %>% as.character %>% unique 
# remove NAs
kegg_ids = kegg_ids[complete.cases(kegg_ids)]
# write list of genes
write.table(kegg_ids, 'KEGG_IDs_biolog.txt', quote = F, col.names = F, row.names = F)
## go here to convert: http://csbg.cnb.csic.es/mbrole/conversion.jsp

# read generated list and merge with biolog
kegg2pub =  read.csv("D:/MRC_Postdoc/Pangenomic/Chem_space/KEGG2PubChemIDs.txt")
biolog = biolog %>% left_join(kegg2pub)


### let's get the missing drugs (as many as we can, at least)
missing = drugs[!drugs %in% unique(drugbank$GENERIC_NAME)]

drugs_missing = biolog %>%
  filter(Metabolite %in% missing) %>%
  select(PubChem_ID) %>% t %>% as.character %>% unique 
# remove NAs
drugs_missing = drugs_missing[complete.cases(drugs_missing)]
write.table(drugs_missing, 'PubChem_ID_missing_metabolites.txt', quote = F, col.names = F, row.names = F)


# after downloading them from pubchem, the file is named as: Metab_structures_missing_biolog.sdf

# file with chem fingerprints from pubchem compounds (~100 more, not bad)
biolog_metabolites_PubChem = read_csv("biolog_metabolites_PubChem.csv") %>% select(-X1, -index, -Number_of_carboxylic_acids_1, -ID) %>%
  rename(PubChem_ID = PUBCHEM_COMPOUND_CID) 


dummy = biolog %>%
  filter(Metabolite %in% missing) %>%
  select(Metabolite, PubChem_ID) %>% unique

biolog_metabolites_PubChem = biolog_metabolites_PubChem %>% left_join(dummy, by = 'PubChem_ID') %>%
  select(PubChem_ID, Metabolite, everything()) %>% rename(GENERIC_NAME = Metabolite)


# bind rows
complete = bind_rows(drugbank, biolog_metabolites_PubChem)

# save data table into csv
write.csv(complete, 'Complete_chem_fingerprints.csv')

With the new data, let’s proceed with the analysis.

  1. Do the PCA:
mat = complete %>% 
  filter(GENERIC_NAME != 'Tannic acid') %>%
  select(Number_of_aliphatic_carboxylic_acids:Number_of_urea_groups) %>% as.matrix
rownames(mat) = complete %>% filter(GENERIC_NAME != 'Tannic acid') %>% select(GENERIC_NAME) %>% t %>% as.character


res.pca = PCA((mat), scale.unit = TRUE, ncp = 5, graph = F)
ind = get_pca_ind(res.pca)
ind_df = data.frame(ind$coord[,1], ind$coord[,2], ind$coord[,3])
colnames(ind_df) = c('Dim1', 'Dim2', 'Dim3')
  1. Plot the data:
test2 = ind_df %>% tibble 
test2['Drugbank'] = complete %>% filter(GENERIC_NAME != 'Tannic acid') %>% select(GENERIC_NAME) %>% t %>% as.character

test2 = test2 %>%
  mutate(biolog = ifelse(Drugbank %in% drugs, Drugbank, NA),
         Category = ifelse(Drugbank %in% drugs, 'Biolog', 'Drugbank'),
         Category = as.factor(Category)) 


fig = plot_ly(data = test2,  x = ~Dim1, y = ~Dim2, z = ~Dim3, text = test2$Drugbank, 
               color = ~Category, colors = c('#FF0900', '#B8B8B8'),
               marker = list(size = 3))
fig
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

Notice that I was removing Tanic Acid from the plot because it’s a super different compound that gets extremely separated from the rest. In any case, there is a major difference between this plot and the previous one because the cloud of points have a different shape. I don’t think this is very important, but we need to have this in mind.

t-SNE representation

PCA is a good method to reduce the dimensionality, but it’s showing some weird results compared to the original publication. Before testing more things with the PCA parameters and transformations, let’s give it a try with the t-SNE method, which in theory is more robust to show non-linear dependencies and better clusters than a PCA.

library(Rtsne)

mat.tsne = complete %>% 
  select(Number_of_aliphatic_carboxylic_acids:Number_of_urea_groups) %>% as.matrix
rownames(mat.tsne) = complete %>% 
  select(GENERIC_NAME) %>% t %>% as.character

This time we have not removed Tanic Acid, so let’s see what happens with the clustering. First of all, let’s run the tsne method. If you want to know more about the paramters, especially the perplexity parameter, go to this website and play with the examples. The code might take a while. If you are running in a slower computer, reduce the max_iter parameter to 1000.

tsne = Rtsne(mat.tsne, check_duplicates = FALSE, pca = FALSE, num_threads = 12,
             normalize = FALSE, max_iter = 5000,
             perplexity = 40, theta = 0.2, dims = 3) 

And let’s plot what we have doing the proper data transformations:

# generate data frame from tnse results
tsne.df = data.frame(tsne$Y)
colnames(tsne.df) = c('Dim1', 'Dim2', 'Dim3')

tsne.df['Drugbank'] = complete %>% 
  # filter(GENERIC_NAME != 'Tannic acid') %>% 
  select(GENERIC_NAME) %>% t %>% as.character

tsne.df = tsne.df %>%
  mutate(biolog = ifelse(Drugbank %in% drugs, Drugbank, NA),
         Category = ifelse(Drugbank %in% drugs, 'Biolog', 'Drugbank'),
         Opacity = ifelse(Category == 'Biolog', 1, 0.1),
         Opacity = as.numeric(Opacity),
         Category = as.factor(Category)) 


fig = plot_ly(data = tsne.df,  x = ~Dim1, y = ~Dim2, z = ~Dim3, text = tsne.df$Drugbank, 
               color = ~Category, colors = c('#FF0900', '#B8B8B8'),
               marker = list(size = 4))
fig

Good! It’s much nicer than the PCA in my opinion. But there’s still the detail of the opacity for the grey points. I haven’t got a proper way to make it work with values within the dataframe, but we can make a slightly more convoluted code to plot it in a nicer way.

# let's try to modify the opacity of grey points
fig = plot_ly()

fig = fig %>%
  add_trace(
    data = tsne.df %>% filter(Category == 'Drugbank'),
    name = 'Drugbank',
    x = ~Dim1, y = ~Dim2, z = ~Dim3, text = tsne.df[tsne.df$Category == 'Drugbank',]$Drugbank, 
    marker = list(
      color = '#B8B8B8', 
      opacity = 0.4, # OPACITY
      size = 4)
    )

fig = fig %>%
  add_trace(
    data = tsne.df %>% filter(Category == 'Biolog'),
    name = 'Biolog',
    x = ~Dim1, y = ~Dim2, z = ~Dim3, text = tsne.df[tsne.df$Category == 'Biolog',]$Drugbank, 
    marker = list(
      color = '#FF0900', 
      size = 4)
  )
fig